jan_human_barcodes <- gem_class_jan %>% 
  filter(call == 'hg19') %>% 
  pull(barcode)

jan_mouse_barcodes <- gem_class_jan %>% 
  filter(call == 'mm10') %>% 
  pull(barcode)

aug_human_barcodes <- gem_class_aug %>% 
  filter(call == 'hg19') %>% 
  pull(barcode)

aug_mouse_barcodes <- gem_class_aug %>% 
  filter(call == 'mm10') %>% 
  pull(barcode)
if (curr_spec == "human"){
  data1 = Read10X(data.dir = filt_aug_hg19)
  data1 = data1[,colnames(data1) %in% aug_human_barcodes]
  data2 = Read10X(data.dir = filt_jan_hg19)
  data2 = data2[,colnames(data2) %in% jan_human_barcodes]
  
  current_data1 = CreateSeuratObject(data1, project = "Day 8", min.cells = 3, min.features = 200)
  current_data2 = CreateSeuratObject(data2, project = "Day 0", min.cells = 3, min.features = 200)
  current_data = merge(current_data1, current_data2, add.cell.ids = c("Day 8", "Day 0"), project = "all human")
  current_data[['percent.mt']] = PercentageFeatureSet(current_data, pattern = "^MT-")
}
if (curr_spec == "mouse"){
  data1 = Read10X(data.dir = filt_aug_mm10)
  data1 = data1[,colnames(data1) %in% aug_mouse_barcodes]
  data2 = Read10X(data.dir = filt_jan_mm10)
  data2 = data2[,colnames(data2) %in% jan_mouse_barcodes]
  
  current_data1 = CreateSeuratObject(data1, project = "Day 5", min.cells = 3, min.features = 200)
  current_data2 = CreateSeuratObject(data2, project = "Day 0", min.cells = 3, min.features = 200)
  current_data = merge(current_data1, current_data2, add.cell.ids = c("Day 5", "Day 0"), project = "all mouse")
  current_data[['percent.mt']] = PercentageFeatureSet(current_data, pattern = "^mt-")
  
}
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# VlnPlot(current_data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(current_data, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2, pt.size = 0)

#normalize data
current_data <- NormalizeData(current_data, normalization.method = "LogNormalize", scale.factor = 10000)
current_data <- FindVariableFeatures(current_data, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(current_data), 10)
plot1 <- VariableFeaturePlot(current_data)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
all.genes <- rownames(current_data)
current_data <- ScaleData(current_data, features = all.genes)
## Centering and scaling data matrix
current_data <- RunPCA(current_data, features = VariableFeatures(object = current_data))
## PC_ 1 
## Positive:  mm10-S100a6, mm10-Hmga2, mm10-Tuba1b, mm10-Tnfrsf12a, mm10-Npm1, mm10-Anxa3, mm10-Tuba1a, mm10-Tagln2, mm10-Tmsb4x, mm10-Actg1 
##     mm10-Ccnd1, mm10-Cfl1, mm10-Tm4sf1, mm10-Tpm1, mm10-Zyx, mm10-Txn1, mm10-Ranbp1, mm10-Timp1, mm10-Birc5, mm10-Hsp90aa1 
##     mm10-Eif5a, mm10-Ncl, mm10-Gapdh, mm10-2810417H13Rik, mm10-H2afz, mm10-Ran, mm10-Lgals1, mm10-Anxa2, mm10-U2af1, mm10-Nop56 
## Negative:  mm10-mt-Co1, mm10-mt-Atp6, mm10-Lpl, mm10-mt-Nd1, mm10-Mgst1, mm10-mt-Cytb, mm10-Sepp1, mm10-Fabp4, mm10-Aplp2, mm10-Col6a3 
##     mm10-Pank3, mm10-Col6a1, mm10-Col6a2, mm10-Cd36, mm10-Idh1, mm10-Acsl1, mm10-Pcx, mm10-Adipoq, mm10-Rarres2, mm10-Adig 
##     mm10-Cidec, mm10-Plin1, mm10-C3, mm10-Retn, mm10-Ghr, mm10-Cebpa, mm10-G0s2, mm10-Scd2, mm10-Cox14, mm10-Ap3s1 
## PC_ 2 
## Positive:  mm10-Cycs, mm10-Fabp5, mm10-Mrpl12, mm10-Adipoq, mm10-Plin1, mm10-Acsl1, mm10-Retn, mm10-Cyc1, mm10-Tmem120a, mm10-Cidec 
##     mm10-Ak2, mm10-Cd36, mm10-Adig, mm10-Idh3a, mm10-G0s2, mm10-Mmd, mm10-Agpat9, mm10-Dgat2, mm10-Mrap, mm10-Uqcrq 
##     mm10-Hspd1, mm10-Nhp2, mm10-Mrpl20, mm10-Ran, mm10-Gpd1, mm10-Wfdc21, mm10-Emc6, mm10-Atp5b, mm10-Phb2, mm10-Grpel1 
## Negative:  mm10-Ptn, mm10-Cxcl12, mm10-Col3a1, mm10-Col1a1, mm10-Lox, mm10-Gas1, mm10-Postn, mm10-Mt1, mm10-Aspn, mm10-Mmp2 
##     mm10-Dclk1, mm10-Sned1, mm10-Cdh11, mm10-Ifitm3, mm10-Cst3, mm10-Gas6, mm10-Mgp, mm10-Npc2, mm10-Fn1, mm10-Zfp36l1 
##     mm10-Snai2, mm10-Pdgfra, mm10-Gas7, mm10-Adamts1, mm10-Nrep, mm10-Maf, mm10-Rps4x, mm10-Cped1, mm10-Gm13305, mm10-Malat1 
## PC_ 3 
## Positive:  mm10-Cd63, mm10-Ubb, mm10-Prdx1, mm10-Ppia, mm10-B2m, mm10-Pcolce, mm10-Cst3, mm10-Atp5h, mm10-Serpinf1, mm10-Tspo 
##     mm10-Itm2b, mm10-Ptn, mm10-Psmb6, mm10-Psmb2, mm10-Ndufa4, mm10-Psmb3, mm10-H3f3b, mm10-Cd81, mm10-Atp5o, mm10-Etfb 
##     mm10-Sparc, mm10-Ndufa11, mm10-Psmb5, mm10-Anxa2, mm10-Psma2, mm10-Ifi27, mm10-Ndufs3, mm10-Lxn, mm10-Dlk1, mm10-Tecr 
## Negative:  mm10-Ppp1r14b, mm10-Ftl1, mm10-Cdc34, mm10-Hras, mm10-Hmga2, mm10-Rps27l, mm10-Mpp6, mm10-Odc1, mm10-Mmd, mm10-Tomm5 
##     mm10-Pmepa1, mm10-Snhg9, mm10-Setd8, mm10-Ube2s, mm10-Neat1, mm10-Gm26909, mm10-Tnnt2, mm10-Klhdc8a, mm10-Cenpa, mm10-Hmga1 
##     mm10-Myh9, mm10-Ercc1, mm10-Lce1g, mm10-Zfp787, mm10-Lockd, mm10-Bdnf, mm10-Cmtm4, mm10-Jade1, mm10-Zfos1, mm10-Itga6 
## PC_ 4 
## Positive:  mm10-Top2a, mm10-Spc25, mm10-Ube2c, mm10-Nusap1, mm10-Hmgb2, mm10-Ccna2, mm10-Cenpf, mm10-Prc1, mm10-Cks2, mm10-Ccnb1 
##     mm10-Sgol1, mm10-H2afx, mm10-Mxd3, mm10-Smc4, mm10-Cdk1, mm10-Aurka, mm10-Aurkb, mm10-2810417H13Rik, mm10-Smc2, mm10-Ckap2l 
##     mm10-Cdkn2d, mm10-Cenpm, mm10-Fam64a, mm10-Tpx2, mm10-Pbk, mm10-Birc5, mm10-Depdc1a, mm10-Incenp, mm10-Cdca8, mm10-Tubb5 
## Negative:  mm10-Lgals3, mm10-Gsto1, mm10-Prr13, mm10-Anxa1, mm10-Aldoa, mm10-Ctsl, mm10-Esd, mm10-Ly6a, mm10-Lgals1, mm10-Ass1 
##     mm10-Rps6, mm10-Dstn, mm10-Capg, mm10-Anxa8, mm10-Ftl1, mm10-S100a6, mm10-Phlda1, mm10-Tmsb4x, mm10-Mt2, mm10-Bdnf 
##     mm10-Cyr61, mm10-Hbegf, mm10-Ldha, mm10-Pkm, mm10-Ptgs2, mm10-Gapdh, mm10-Thbs1, mm10-Sqstm1, mm10-Txn1, mm10-Timp1 
## PC_ 5 
## Positive:  mm10-Ldhb, mm10-Atp5k, mm10-Atp5j2, mm10-Chchd2, mm10-Ndufa4, mm10-Tnnt2, mm10-Mdh2, mm10-Psma5, mm10-Atp5h, mm10-Cenpa 
##     mm10-Fkbp3, mm10-Hspa8, mm10-Peg10, mm10-Gchfr, mm10-Rasl11a, mm10-Cnih2, mm10-Atp5o, mm10-Pmpcb, mm10-Suclg1, mm10-Gsta4 
##     mm10-Ncl, mm10-Phb2, mm10-Sparcl1, mm10-Cox8a, mm10-Fabp4, mm10-Hspb8, mm10-Ddx39, mm10-Uqcr11, mm10-Khk, mm10-Psmb7 
## Negative:  mm10-Col1a1, mm10-Thbs1, mm10-Prss23, mm10-Acta2, mm10-Tpm1, mm10-Flna, mm10-Wisp2, mm10-Eif4ebp1, mm10-Sparc, mm10-Cnn2 
##     mm10-Atf5, mm10-Actg2, mm10-Wisp1, mm10-Thy1, mm10-Ifitm3, mm10-Fbn1, mm10-Myl9, mm10-Tmsb4x, mm10-Cdc42ep3, mm10-Trib3 
##     mm10-Csrp1, mm10-Dstn, mm10-Fhl1, mm10-Tagln, mm10-Cav1, mm10-Tpm2, mm10-Chac1, mm10-Fhl2, mm10-Ly6a, mm10-Slit2
print(current_data[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  mm10-S100a6, mm10-Hmga2, mm10-Tuba1b, mm10-Tnfrsf12a, mm10-Npm1 
## Negative:  mm10-mt-Co1, mm10-mt-Atp6, mm10-Lpl, mm10-mt-Nd1, mm10-Mgst1 
## PC_ 2 
## Positive:  mm10-Cycs, mm10-Fabp5, mm10-Mrpl12, mm10-Adipoq, mm10-Plin1 
## Negative:  mm10-Ptn, mm10-Cxcl12, mm10-Col3a1, mm10-Col1a1, mm10-Lox 
## PC_ 3 
## Positive:  mm10-Cd63, mm10-Ubb, mm10-Prdx1, mm10-Ppia, mm10-B2m 
## Negative:  mm10-Ppp1r14b, mm10-Ftl1, mm10-Cdc34, mm10-Hras, mm10-Hmga2 
## PC_ 4 
## Positive:  mm10-Top2a, mm10-Spc25, mm10-Ube2c, mm10-Nusap1, mm10-Hmgb2 
## Negative:  mm10-Lgals3, mm10-Gsto1, mm10-Prr13, mm10-Anxa1, mm10-Aldoa 
## PC_ 5 
## Positive:  mm10-Ldhb, mm10-Atp5k, mm10-Atp5j2, mm10-Chchd2, mm10-Ndufa4 
## Negative:  mm10-Col1a1, mm10-Thbs1, mm10-Prss23, mm10-Acta2, mm10-Tpm1
VizDimLoadings(current_data, dims = 1:2, reduction = "pca")

DimPlot(current_data, reduction = "pca")

DimHeatmap(current_data, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(current_data, dims = 1:15, cells = 500, balanced = TRUE)

current_data <- JackStraw(current_data, num.replicate = 100)
current_data <- ScoreJackStraw(current_data, dims = 1:20)
JackStrawPlot(current_data, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (`geom_point()`).

ElbowPlot(current_data)

if (curr_spec == "human"){housekeeping = "hg19-PPIA"} #HPRT
if (curr_spec == "mouse"){housekeeping = "mm10-Ppia"}

counts = current_data@assays$RNA@scale.data
cells_d0 = rownames(current_data@meta.data[current_data@meta.data$orig.ident == "Day 0",])
day_0_counts = counts[,colnames(counts) %in% cells_d0]
avg_housekeeping_d0 = day_0_counts[rownames(day_0_counts) %in% housekeeping,]

cells_final_day = rownames(current_data@meta.data[!current_data@meta.data$orig.ident == "Day 0",])
day_f_counts = counts[,colnames(counts) %in% cells_final_day]
avg_housekeeping_f = day_f_counts[rownames(day_f_counts) %in% housekeeping,]

min_val <- min(min(avg_housekeeping_d0), min(avg_housekeeping_f))
max_val <- max(max(avg_housekeeping_d0), max(avg_housekeeping_f))
par(mfrow=c(1,2)) # Set the plotting layout to two plots side-by-side
boxplot(avg_housekeeping_d0, main="Day 0", xlab=sprintf("Housekeeping gene %s - PPIA", curr_spec), ylab="Average logNorm counts", ylim = c(min_val, max_val))

boxplot(avg_housekeeping_f, main="Final Day", xlab=sprintf("Housekeeping gene %s - PPIA", curr_spec), ylab="Average logNorm counts", ylim = c(min_val, max_val))

current_data <- FindNeighbors(current_data, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
current_data <- FindClusters(current_data, resolution = 0.09)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7644
## Number of edges: 238819
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9548
## Number of communities: 3
## Elapsed time: 14 seconds
head(Idents(current_data), 5)
## Day 5_AAACCTGAGACTACAA-1 Day 5_AAACCTGAGCTCCTTC-1 Day 5_AAACCTGAGGTACTCT-1 
##                        2                        1                        2 
## Day 5_AAACCTGCAGCCTATA-1 Day 5_AAACCTGGTGTGCGTC-1 
##                        1                        1 
## Levels: 0 1 2
current_data <- RunTSNE(current_data, dims = 1:10)
DimPlot(current_data, reduction = "tsne")

current_data <- RunUMAP(current_data, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:10:00 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:10:00 Read 7644 rows and found 10 numeric columns
## 19:10:00 Using Annoy for neighbor search, n_neighbors = 30
## 19:10:00 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:10:05 Writing NN index file to temp file /tmp/RtmpFLlU87/file16d927f6ba71
## 19:10:05 Searching Annoy index using 1 thread, search_k = 3000
## 19:10:39 Annoy recall = 100%
## 19:10:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 19:10:41 Initializing from normalized Laplacian + noise (using irlba)
## 19:10:42 Commencing optimization for 500 epochs, with 302464 positive edges
## 19:11:38 Optimization finished
DimPlot(current_data, reduction = "umap")

#-------save current_data as R object for easier referencing + GEO upload --------
current_data_R = sprintf("current_data.%s.Rds", curr_spec)
if (!file.exists(current_data_R)) {saveRDS(current_data, file = current_data_R)}
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
current_data.markers <- FindAllMarkers(current_data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
top_markers <- current_data.markers %>%
  group_by(cluster) %>%
  slice_max(n = 4, order_by = avg_log2FC)

# Create a list of plots for the top 3 markers in each cluster
cluster_ids <- unique(top_markers$cluster)
plots_list <- lapply(cluster_ids, function(cluster_id) {
  marker_genes <- top_markers %>%
    filter(cluster == cluster_id) %>%
    pull(gene)
  
  p <- FeaturePlot(current_data, features = marker_genes)
  return(p)
})
plots_list[[1]]

plots_list[[2]]

plots_list[[3]]

current_data.markers %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(current_data, features = top10$gene) + NoLegend()

markers_csv = sprintf("current_data.markers.%s.csv", curr_spec)
if (!file.exists(markers_csv)) {write.csv(current_data.markers, file = markers_csv)}
library(slingshot)
## Loading required package: princurve
## Loading required package: TrajectoryUtils
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(RColorBrewer)
pal <- c(RColorBrewer::brewer.pal(9, "Set1"), RColorBrewer::brewer.pal(8, "Set2"))
dimred = current_data@reductions$umap@cell.embeddings
clustering = current_data$RNA_snn_res.0.09
counts = as.matrix(current_data@assays$RNA@counts[current_data@assays$RNA@var.features,])
set.seed(1)
lineages <- slingshot::getLineages(data = dimred,
                        clusterLabels = clustering,
                        start.clus = "0") #define where to start the trajectories
# Plot the lineages
par(mfrow = c(1, 2))
plot(dimred[, 1:2], col = pal[clustering], cex = 0.5, pch = 16, main = paste(curr_spec,"Cluster Labels"))
for (i in levels(clustering)) {
    text(mean(dimred[clustering == i, 1]), mean(dimred[clustering == i, 2]), labels = i, font = 2)
}
plot(dimred[, 1:2], col = pal[clustering], cex = 0.5, pch = 16, main =  paste(curr_spec,"Pseudotime"))
lines(SlingshotDataSet(lineages), lwd = 3, col = "black")

library(eulerr)
library(stringr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
human.markers = read.csv(sprintf("current_data.markers.%s.csv", "human"))
mouse.markers = read.csv(sprintf("current_data.markers.%s.csv", "mouse"))
total_markers = rbind(human.markers, mouse.markers) %>% 
  dplyr::mutate(species = str_extract(gene, "^[^-]+"),
                gene_name = str_extract(gene, "(?<=-)[^[:space:]]+"), # extract the second wildcard
                gene_name = toupper(gene_name)) %>%
  dplyr::select(cluster, species, gene, gene_name)
plots_list <- list()
for (cluster_num in unique(total_markers$cluster)){
  cluster_markers <- total_markers[total_markers$cluster == cluster_num,]
  species_sets <- list(
    human = unique(cluster_markers$gene_name[cluster_markers$species == "hg19"]),
    mouse = unique(cluster_markers$gene_name[cluster_markers$species == "mm10"])
  )
  venn <- euler(species_sets, shape = "ellipse", fills = c("#00A0B0", "#CC333F"), quantities = TRUE)
  p = plot(venn, main = paste0("Cluster ", cluster_num), counts = TRUE, quantities = TRUE, cex = 1.5, lwd = 2, fontface = "bold")
  plots_list[[cluster_num + 1]] <- p
}
grid.arrange(grobs = plots_list, ncol = 3)

#making sure mouse and human gene format is the same
mouse.markers = total_markers[total_markers$species == 'mm10',]
ensembl <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
ensembl_ids <- getBM(attributes = c("ensembl_gene_id", "external_gene_name", "hsapiens_homolog_associated_gene_name"),
                     filters = "external_gene_name",
                     values = toupper(mouse.markers$gene_name),
                     mart = ensembl) 
ensembl_ids_filtered <- ensembl_ids %>%
  filter(!is.na(hsapiens_homolog_associated_gene_name) & nzchar(hsapiens_homolog_associated_gene_name)) %>%
  mutate(external_gene_name = toupper(external_gene_name))
mouse.markers = merge(mouse.markers, ensembl_ids_filtered, by.x = "gene_name", by.y = "external_gene_name")